This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
### Load packages
install.packages(“ggplot2”) install.packages(“data.table”) install.packages(“ggrepel”)
# Load plotting / reshaping helpers.
library(ggplot2)
library(reshape2)   # melt(), dcast(), colsplit()
library(ggrepel)    # geom_text_repel()
#require(data.table)
# Directory holding the Barnone count tables and the sample index sheet.
# Fix: "\ " is an unrecognized escape inside an R string and fails to parse;
# R paths need no shell-style escaping of spaces, so use a plain space.
datadir <- "~/Desktop/Lab research/Barseq_SGA/N-lim-4mutants/seqdata/"
dir(datadir)
## [1] "barseq_Nlim_cache"
## [2] "barseq_Nlim.Rmd"
## [3] "filterData (0MM) without zero counts sample.csv"
## [4] "GeneInter_N_lim.csv"
## [5] "n.barnone.0MM.counts.csv"
## [6] "n.barnone.0MM.counts.txt"
## [7] "n.barnone.1MM.counts.csv"
## [8] "n.barnone.1MM.counts.txt"
## [9] "n.barnone.2MM.counts.csv"
## [10] "n.barnone.2MM.counts.txt"
## [11] "ScreenCounts201605050MM.csv"
## [12] "ScreenCounts201605050MMMelted.csv"
## [13] "ScreenCounts201605051MM.csv"
## [14] "ScreenCounts201605051MMMelted.csv"
## [15] "ScreenCounts201605052MM.csv"
## [16] "ScreenCounts201605052MMMelted.csv"
## [17] "strain counts distribution"
# Sample metadata sheet: one row per sequencing sample.
index_path <- paste0(datadir, "GeneInter_N_lim.csv")
index <- read.csv(index_path)
# Key rows by SampleIndex so that index["Sample9", ] style lookups work below.
rownames(index) <- index$SampleIndex
head(index)
## Index LibraryName SampleIndex StarvingMedia Replicate
## Sample1 1 HO Sample1 N-lim 1
## Sample7 2 HO Sample7 N-lim 1
## Sample13 3 HO Sample13 N-lim 1
## Sample19 4 HO Sample19 N-lim 1
## Sample25 5 HO Sample25 N-lim 1
## Sample31 6 HO Sample31 N-lim 1
## SamplingTime PrimerIndex
## Sample1 0h 1
## Sample7 9h 7
## Sample13 14h 13
## Sample19 18h 19
## Sample25 24h 25
## Sample31 32h 31
Here we make a list of data.frames, where each data.frame is the counts table from a different Barnone run, each allowing a different number of mismatches.
# Collect the Barnone count tables: one "*counts*txt" file per mismatch
# setting. Fixes: call dir() once instead of twice, and spell out
# header=TRUE (T is a reassignable variable, not a reserved word).
allFileNames <- grep("counts.*txt", dir(datadir), value = TRUE)
rawCountsList <- list()
for (fz in allFileNames) {
  # Pull the "<digit>MM" token out of the file name to label the list entry
  # (e.g. "n.barnone.0MM.counts.txt" -> "0MM").
  numberMismatches <- sub(".*(\\dMM).*", "\\1", fz)
  rawCountsList[[numberMismatches]] <- read.table(paste0(datadir, fz),
                                                  header = TRUE)
}
# Peek at the first table: one row per strain, one UP/DOWN column per sample.
rawCountsList[[1]][1:5,1:5]
## Strain Sample9_UP Sample9_DOWN Sample111_UP Sample111_DOWN
## 1 YAL008W 0 0 0 0
## 2 YBR255W 472 0 71 0
## 3 YGR164W 0 66 0 546
## 4 YGR131W 0 0 0 0
## 5 YNL003C 1 0 348 0
# Below, we melt each raw counts table into a tidy long data.frame and
# annotate it with per-sample totals and the sample metadata from `index`.
datar <- list()
for (MM in names(rawCountsList)) {
# One row per Strain x sample column; the melted column name lands in
# `variable` and the count in `Counts`.
mcounts <- melt(rawCountsList[[MM]], id.vars="Strain", value.name="Counts")
# Column names look like "Sample9_UP": split on "_" into SampleNum + Tag.
# (`:` binds tighter than `+`, so ncol(mcounts)+1:2 targets the next two
# column positions.)
mcounts[,ncol(mcounts)+1:2] <- colsplit(mcounts$variable,"_", c("SampleNum","Tag"))
#aggregate counts in each sample collection -- from same set of genomic DNA
sampleTotalCounts <- aggregate(Counts~variable,data=mcounts,sum)
rownames(sampleTotalCounts) <- sampleTotalCounts$variable
#generate the total counts for each sample collection
# (rowname lookup broadcasts the per-sample total onto every strain row)
mcounts$TotalCounts <- sampleTotalCounts[mcounts$variable,"Counts"]
#ratio of certain strain counts over total counts in each sample
mcounts$RelativeCounts <- mcounts$Counts / mcounts$TotalCounts
# Keep only samples that belong to this experiment (listed in `index`).
mcounts<-subset(mcounts, mcounts$SampleNum %in% index$SampleIndex)
# Append the seven metadata columns via rowname lookup into `index`
# (rownames were set to SampleIndex when `index` was read in).
mcounts[,ncol(mcounts)+1:7] <- index[mcounts$SampleNum,
c("Index","LibraryName","SampleIndex","StarvingMedia","Replicate","SamplingTime","PrimerIndex")]
# Final tidy table for this mismatch setting, columns in a fixed order.
datar[[MM]] <- mcounts[,c("Strain","SampleNum","Tag",
"Index","LibraryName","SampleIndex","StarvingMedia","Replicate","SamplingTime","PrimerIndex",
"Counts","RelativeCounts")]
}
#A sample of what that looks like:
datar[[1]][1:5,1:10]
## Strain SampleNum Tag Index LibraryName SampleIndex StarvingMedia
## 1 YAL008W Sample9 UP 22 HO Sample9 N-lim
## 2 YBR255W Sample9 UP 22 HO Sample9 N-lim
## 3 YGR164W Sample9 UP 22 HO Sample9 N-lim
## 4 YGR131W Sample9 UP 22 HO Sample9 N-lim
## 5 YNL003C Sample9 UP 22 HO Sample9 N-lim
## Replicate SamplingTime PrimerIndex
## 1 3 9h 9
## 2 3 9h 9
## 3 3 9h 9
## 4 3 9h 9
## 5 3 9h 9
# Select my data from the whole sequencing-library mixture: restrict every
# mismatch table to the samples listed in the index sheet. (The melt step
# above already applied this filter, so this is a defensive no-op;
# lapply() keeps the "0MM"/"1MM"/"2MM" names on the result.)
mydatar <- lapply(datar, function(dd) {
  subset(dd, dd$SampleNum %in% index$SampleIndex)
})
mydatar[[1]][1:5,1:10]
## Strain SampleNum Tag Index LibraryName SampleIndex StarvingMedia
## 1 YAL008W Sample9 UP 22 HO Sample9 N-lim
## 2 YBR255W Sample9 UP 22 HO Sample9 N-lim
## 3 YGR164W Sample9 UP 22 HO Sample9 N-lim
## 4 YGR131W Sample9 UP 22 HO Sample9 N-lim
## 5 YNL003C Sample9 UP 22 HO Sample9 N-lim
## Replicate SamplingTime PrimerIndex
## 1 3 9h 9
## 2 3 9h 9
## 3 3 9h 9
## 4 3 9h 9
## 5 3 9h 9
#So which mismatch tolerance to use? The assumption is that some barcodes ain’t perfect, and that Barnone will find the right one within a mismatch parameter.
#Which one to use?
#Presumably there’s only real barcodes in the library, so I should increase the parameter to the point where I get more reads per strain, but not to the point where strains start to cannibalize counts from other strains.
# Per-strain total counts for each mismatch setting, dropping rows without
# a Replicate annotation.
summedCounts <- lapply(mydatar, function(dd) {
  aggregate(Counts ~ Strain, FUN = sum, data = subset(dd, Replicate != ""))
})
# Stack the three tables (".id" column carries the mismatch label), then
# spread to one column per mismatch setting for cross-run comparison.
allSummedCounts <- data.table::rbindlist(summedCounts, idcol = TRUE)
dallSummedCounts <- dcast(data = allSummedCounts, Strain ~ .id,
                          value.var = "Counts")
# Scatter per-strain totals of one mismatch setting against another on
# log-log axes; strains with zero counts become -Inf and are dropped,
# which is what the warnings below report.
g <- ggplot(dallSummedCounts)+
scale_y_log10()+scale_x_log10()+theme_bw()+
geom_point(size=0.1)
# 0MM vs 1MM: does raising the tolerance add reads uniformly?
g+aes(x=`0MM`,y=`1MM`)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
# 1MM vs 2MM: same comparison one step up.
g+aes(x=`1MM`,y=`2MM`)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
# Overlay the per-strain count distributions for the three mismatch settings
# and save to PDF. Fixes: give the output file its ".pdf" extension, and
# print() the plot explicitly -- relying on top-level auto-printing is
# fragile (it does not happen inside functions or loops), and an explicit
# print() guarantees the plot is drawn onto the pdf device.
pdf("strain counts distribution.pdf", width = 8, height = 6)
dallSummedCounts_m <- melt(dallSummedCounts, id.var = "Strain")
p <- ggplot(dallSummedCounts_m, aes(x = value)) +
  theme_bw() + scale_x_log10() +
  ylab("Strain counts distribution") +
  geom_density(data = subset(dallSummedCounts_m, variable == "0MM"),
               aes(colour = "0MM")) +
  geom_density(data = subset(dallSummedCounts_m, variable == "1MM"),
               aes(colour = "1MM")) +
  geom_density(data = subset(dallSummedCounts_m, variable == "2MM"),
               aes(colour = "2MM"))
print(p)
dev.off()
## quartz_off_screen
## 2
# Histogram of per-strain totals, one plot per mismatch setting. The
# "Removed N rows" warnings count strains with zero reads (log10(0) = -Inf);
# note the removed-row count shrinks as the mismatch tolerance grows.
g <- ggplot(dallSummedCounts)+theme_bw()+
scale_x_log10()+geom_histogram(bins=50)
g+aes(x=`0MM`)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1996 rows containing non-finite values (stat_bin).
g+aes(x=`1MM`)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1866 rows containing non-finite values (stat_bin).
g+aes(x=`2MM`)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1650 rows containing non-finite values (stat_bin).
#(interpretation)
#And what about the mismatches of sample indices?
# Total read counts per (sample, tag) pair, for each mismatch setting.
summedBySample <- lapply(mydatar, function(dd) {
  aggregate(Counts ~ SampleNum + Tag, FUN = sum, data = dd)
})
# Stack the per-mismatch tables; ".id" carries the mismatch label.
allSummedBySample <- data.table::rbindlist(summedBySample, idcol = TRUE)
# Flag samples that belong to this experiment (present in the index sheet).
allSummedBySample$Used <-
  ifelse(allSummedBySample$SampleNum %in% index$SampleIndex, "Yep", "Nope")
# (Kept for reference) an earlier filter to only the experiment's samples:
#mySummedSample<- subset(allSummedBySample,allSummedBySample$SampleNum%in%
# paste("Sample",rownames(index),sep=""))
##this step seems not necessary for the samples mixed from different experiment
#only plot my sample
# Per-sample total counts for the UP tag, coloured by whether the sample
# belongs to this experiment; one panel row per mismatch setting (.id).
allSummedBySampleUP<-subset(allSummedBySample, allSummedBySample$Tag == "UP")
ggplot(allSummedBySampleUP)+theme_bw()+
aes(x=SampleNum,y=log10(Counts),col=Used)+facet_grid(.id~Tag)+
geom_point()+#scale_y_log10()+
theme(axis.text.x=element_text(angle=90,size=5,face="bold"),legend.position="top")
# Same plot for the DOWN tag.
allSummedBySampleDOWN<-subset(allSummedBySample, allSummedBySample$Tag == "DOWN")
ggplot(allSummedBySampleDOWN)+theme_bw()+
aes(x=SampleNum,y=log10(Counts),col=Used)+facet_grid(.id~Tag)+
geom_point()+#scale_y_log10()+
theme(axis.text.x=element_text(angle=90,size=5,face="bold"),legend.position="top")
# Persist the filtered tables, one pair of CSVs per mismatch setting.
# Long (melted) format first:
for (MM in names(mydatar)) {
  kept <- subset(mydatar[[MM]], Replicate != "")
  write.csv(kept, file = paste0("N_lim4", MM, "Melted.csv"))
}
# Then wide format: one row per Strain+Tag, one column per sample
# (sample metadata encoded into the column names).
for (MM in names(mydatar)) {
  kept <- subset(mydatar[[MM]], Replicate != "")
  wide <- dcast(kept,
                Strain + Tag ~ SampleNum + LibraryName + StarvingMedia +
                  SampleIndex + Replicate + SamplingTime,
                value.var = "Counts")
  write.csv(wide, file = paste0("N_lim4", MM, ".csv"))
}
# Work with a single mismatch setting from here on.
useMM <- "0MM"
#sdat <- subset(mydatar[[useMM]], Replicate!="")
sdat <- subset(mydatar[[useMM]], SampleNum %in% index$SampleIndex)
# Convert every column EXCEPT the last two (Counts, RelativeCounts) to
# factor: (-1:0)+ncol(sdat) indexes the final two columns, and the leading
# minus drops them from the selection.
sdat[,-c((-1:0)+ncol(sdat))] <- lapply(sdat[,-c((-1:0)+ncol(sdat))],
factor)
# Total counts per sample (weight=Counts), split by tag, coloured by the
# library/replicate/time combination.
ggplot(sdat)+
aes(x=SampleNum,
col=LibraryName:Replicate:SamplingTime,weight=Counts)+
facet_grid(Tag~.)+geom_point(stat="count")+
theme(axis.text.x=element_text(angle=90))+
scale_y_log10()
# Per-sample count distributions as boxplots; zero counts become -Inf on
# the log axis, producing the removed-rows warning below.
ggplot(sdat)+
aes(x=SampleNum:Tag,y=Counts)+
geom_boxplot()+scale_y_log10()+
facet_grid(LibraryName~SamplingTime+Replicate,
scales="free_x",space="free")+
theme(axis.text.x=element_text(angle=90))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 800752 rows containing non-finite values (stat_boxplot).
#Indexing numeric data: values will be adjusted so they are equal to each other in a given starting time period.
nozerosdat<-subset(sdat,sdat$Counts>0)
##for (i in unique(nozerosdat$Strain)){
# s<-subset(nozerosdat, nozerosdat$Strain==i)
# apply(s, 1, function(x){return x})}
# Per-sample totals for the 0MM run.
# NOTE(review): zeroMM_sum is computed but never used below -- confirm
# whether it can be dropped.
zeroMM<-subset(allSummedBySample, allSummedBySample$.id=="0MM")
zeroMM_sum<-aggregate(zeroMM$Counts ~ zeroMM$SampleNum, FUN = sum)
# Keep only well-covered samples (>50k reads in the 0MM run). Single-bracket
# filtering with no comma works here because rbindlist() returned a
# data.table, where DT[i] filters rows.
Sample_0MM<-allSummedBySample[allSummedBySample$Counts>5e4 & allSummedBySample$.id=="0MM"]
filterData<-subset(sdat, sdat$SampleNum %in% Sample_0MM$SampleNum)
ssdat<-subset(sdat,sdat$Counts >0 & sdat$SampleNum %in% Sample_0MM$SampleNum)
# Wide matrix: rows = strains, columns = sample/metadata combinations.
dsdat <- dcast(Strain~SampleNum+Tag+LibraryName+Replicate+SamplingTime+StarvingMedia,
data=ssdat,value.var="Counts")
# Combinations absent after filtering come back as NA; treat as zero reads.
dsdat[is.na(dsdat)] <- 0
rownames(dsdat) <- dsdat[,1]
dsdat <- dsdat[,-1]
# PCA on the transpose: observations = samples, variables = strains.
pc <- prcomp(t(dsdat))
pz <- data.frame(pc$x)
# Recover the metadata fields that dcast() packed into the column names.
pz[,ncol(pz)+(1:6)] <- colsplit(rownames(pz),"_",
names=c("SampleNum","Tag","LibraryName","Replicate","SamplingTime","StarvingMedia"))
# Convert those six new (last) columns to factors for plotting.
pz[ncol(pz)-(0:5)] <- lapply(pz[ncol(pz)-(0:5)],factor)
###### Plot PCA vs UP and DOWN tag
g <- ggplot(pz)+geom_point()
# PC1/PC2 coloured by tag: do UP and DOWN tags separate?
g+aes(x=PC1,y=PC2,col=Tag)
g+facet_wrap(~Tag,scales="free")+
aes(x=PC1,y=PC2,col=LibraryName:SamplingTime:Replicate)+
theme(legend.position="bottom")
######## Plot PCA separated by tag and library
g <- ggplot(pz)+
geom_point()+theme(legend.position="bottom")
g+facet_wrap(~Tag,scales="free")+
aes(x=PC1,y=PC2,col=LibraryName:SamplingTime:Replicate)+
geom_text_repel(aes(label=LibraryName),alpha=0.5,nudge_x=200)
g+facet_wrap(~LibraryName+Tag,scales="free")+
aes(x=PC1,y=PC2,col=LibraryName:SamplingTime:Replicate)+
geom_text_repel(aes(label=LibraryName),alpha=0.5,nudge_x=200)
######## Plot PCA separated by sampling time
g <- ggplot(pz)+
geom_point()+theme(legend.position="bottom")
g+facet_grid(Tag~SamplingTime,scales="free")+
aes(x=PC1,y=PC2,col=Replicate)+
geom_text_repel(aes(label=LibraryName),size=3,alpha=0.5,nudge_y=200)
#g+facet_wrap(Tag~LibraryName+SamplingTime,scales="free")+
# Plot PC3 vs PC4 -- the less representative components for the data set
g+facet_wrap(~Tag,scales="free")+
aes(x=PC3,y=PC4,col=LibraryName:SamplingTime:Replicate)+
geom_text_repel(aes(label=LibraryName),alpha=0.5,nudge_x=200)
g+facet_wrap(~LibraryName+Tag,scales="free")+
aes(x=PC3,y=PC4,col=LibraryName:SamplingTime:Replicate)+
geom_text_repel(aes(label=LibraryName),alpha=0.5,nudge_x=200)
#ggplot(mydatar)+theme_bw()+aes(x=(sumz))+
# geom_histogram(binwidth=50)+facet_grid(time~pulse+replicate)+
# xlim(0,4e3)